# --- Session Setup ---
# The file paths used by this script are relative to the project root, i.e.
# the folder that contains JOP_Replication_Materials/. The root defaults to
# the original author's Dropbox folder; set the JOP_REPLICATION_ROOT
# environment variable to run the script on another machine.
# The caller's workspace is deliberately not cleared (no rm(list = ls())),
# so sourcing this script does not destroy unrelated objects.
project_root <- Sys.getenv("JOP_REPLICATION_ROOT", unset = "")
if (!nzchar(project_root)) {
  # Variable unset or empty: fall back to the original hard-coded location.
  project_root <- "/Users/John/Dropbox/"
}
setwd(project_root)

# --- Load Required Packages ---
library(readr)
library(readxl)
library(dplyr)
library(stringr)

# --- Provincial Output: Read and Standardise ---
# Rows with a missing Year are dropped. Source columns 1-5 and 7 are kept and
# renamed by position in a single select() call, and csic4 is left-padded
# with zeros to a width of four characters.
df <- read_csv(
  file = "JOP_Replication_Materials/data/raw/provincial_output.csv",
  show_col_types = FALSE
) %>%
  filter(!is.na(Year)) %>%
  dplyr::select(
    year = 1,
    industry = 2,
    csic4 = 3,
    region = 4,
    num_enterprises = 5,
    output = 7
  ) %>%
  mutate(csic4 = str_pad(csic4, width = 4, side = "left", pad = "0"))

# --- National Totals ---
# Rows whose region is "National Total" carry the country-wide output for a
# year/csic4 pair; keep just those keys and expose the output as
# national_total.
national <- df %>%
  filter(region == "National Total") %>%
  dplyr::select(year, csic4, national_total = output)

# --- Attach National Totals to Provincial Rows ---
# Remove the national aggregate rows so df holds the remaining regions only.
df <- df %>%
  filter(region != "National Total")

# Every remaining row picks up national_total for its year/csic4 pair. The
# join is allowed to fan out when keys repeat on either side; distinct() then
# keeps the first row for each year/csic4/region combination.
merged <- distinct(
  left_join(
    df,
    national,
    by = join_by(year, csic4),
    relationship = "many-to-many"
  ),
  year, csic4, region,
  .keep_all = TRUE
)

# --- Output Shares and Herfindahl-Hirschman Index ---
# Within each year/csic4 group:
#   share_output = row output divided by the national total
#   hhi          = sum of squared shares, ignoring missing shares
# Rows whose share is missing are dropped afterwards; is.na() is also TRUE
# for NaN, so one test covers both cases. The result stays grouped by
# year and csic4.
merged <- merged %>%
  group_by(year, csic4) %>%
  mutate(share_output = output / national_total) %>%
  mutate(hhi = sum(share_output^2, na.rm = TRUE)) %>%
  filter(!is.na(share_output))

# --- Filter to Clean Groupings (Share ≈ 1) ---
# Keep only year/csic4 groups whose shares add up to more than 0.97 and at
# most 1, then collapse each surviving group to a single row (the first one;
# hhi is identical on every row of a group).
#
# The upper bound carries a small tolerance: shares that sum to exactly 1
# mathematically can evaluate to slightly more than 1 in double arithmetic,
# and a strict `<= 1` would silently drop those groups.
share_tol <- sqrt(.Machine$double.eps)

merged <- merged %>%
  group_by(year, csic4) %>%
  mutate(sum_share = sum(share_output)) %>%
  filter(sum_share > 0.97, sum_share <= 1 + share_tol) %>%
  distinct(year, csic4, .keep_all = TRUE) %>%
  # Drop the grouping so it does not leak into later steps.
  ungroup()

# --- ISIC-CSIC Correspondence Table ---
# Read the crosswalk, then left-pad both code columns with zeros to a width
# of four characters so they are formatted like csic4.
isic_map <- read_csv(
  file = "JOP_Replication_Materials/data/raw/cic_isic_correspondence_table.csv",
  show_col_types = FALSE
)

isic_map <- isic_map %>%
  mutate(isic = str_pad(isic, width = 4, side = "left", pad = "0")) %>%
  mutate(cic = str_pad(cic, width = 4, side = "left", pad = "0"))

# --- Map to ISIC and Take the Median HHI per ISIC Code ---
# Each year/csic4 row is matched to its ISIC code(s); rows without a match
# are discarded. The median is grouped by isic only, so it pools every year
# and csic4 row mapped to that code and repeats across years in the output.
# NOTE(review): confirm that pooling across years (rather than a per-year
# median) is intended.
final <- merged %>%
  left_join(
    isic_map,
    by = join_by(csic4 == cic),
    relationship = "many-to-many"
  ) %>%
  filter(!is.na(isic)) %>%
  group_by(isic) %>%
  mutate(median_hhi_isic = median(hhi, na.rm = TRUE)) %>%
  ungroup() %>%
  dplyr::select(year, isic, median_hhi_isic) %>%
  # One row per isic/year pair (first occurrence wins).
  distinct(isic, year, .keep_all = TRUE)

# --- Write Output ---
# write_csv() does not create missing directories, so make sure the target
# folder exists first (no-op, and no warning, when it is already there).
output_path <- "JOP_Replication_Materials/data/processed/hhi_final.csv"
dir.create(dirname(output_path), recursive = TRUE, showWarnings = FALSE)
write_csv(final, output_path)
